# importing dataset for Question 1 and Question 2
# tips.csv appears to hold the complete Odds column (it is used as ground
# truth when scoring imputations below); tips_tailored.csv is the same data
# with some Odds values set to NA — TODO confirm against the files.
tips_data<-read.csv(file="/Users/vikramg/Desktop/Data_Analytics/Assignment_4/tips.csv", header=TRUE, sep=",")
data<-read.csv(file="/Users/vikramg/Desktop/Data_Analytics/Assignment_4/tips_tailored.csv", header=TRUE, sep=",")
# Root-mean-square error between estimates `m` and observations `o`.
# NA pairs are dropped (na.rm = TRUE) so partially-missing vectors still
# produce a score. Returns a single non-negative numeric.
rmse <- function(m, o) {
  sqrt(mean((m - o)^2, na.rm = TRUE))
}
Two forms of missing data: Missing completely at random (MCAR): this form exists when the missing values are randomly distributed across all observations. Missing at random (MAR): the missing values are not randomly distributed across all observations, but are instead concentrated within one or more sub-samples.
Issue regarding missing data: if missing values are not handled properly, the researcher may draw inaccurate inferences about the data. With improper handling, the results obtained will differ from those that would have been obtained had the values not been missing.
# Mean imputation: replace every missing Odds with the overall mean of the
# observed Odds, keeping the original (NA-containing) column alongside.
data_Odds_mean <- setNames(data.frame(ifelse(is.na(data$Odds), mean(data$Odds, na.rm=TRUE), data$Odds),data$Odds),c("Odds_mean","Odds"))
# Just the rows whose original Odds was missing (i.e. the imputed rows).
missing_mean = data_Odds_mean[is.na(data_Odds_mean$Odds),]
print("RMSE")
## [1] "RMSE"
# Score the mean-imputed column against the complete Odds from tips.csv.
print(rmse(tips_data$Odds,data_Odds_mean$Odds_mean))
## [1] 0.0779236
# Median imputation: same scheme as above, but fill missing Odds with the
# median of the observed Odds instead of the mean.
data_Odds_median <- setNames(data.frame(ifelse(is.na(data$Odds), median(data$Odds, na.rm=TRUE), data$Odds),data$Odds),c("Odds_median","Odds"))
# Rows whose original Odds was missing.
missing_median <- data_Odds_median[is.na(data_Odds_median$Odds),]
print("RMSE")
## [1] "RMSE"
# Median imputation scores better (lower RMSE) than mean imputation here.
print(rmse(tips_data$Odds,data_Odds_median$Odds_median))
## [1] 0.05705324
library(dplyr)
# Per-horse mean imputation: fill each missing Odds with the mean of the
# observed Odds for the same Horse (grouped mutate) rather than the
# global mean.
horse_odds = subset(data,select=c("Horse","Odds"))
# NOTE(review): a horse with no observed Odds at all would get NaN here
# (mean of an empty set) — confirm every horse has >= 1 observed value.
horse_mean = setNames(data.frame(horse_odds %>% group_by(Horse) %>% mutate(Odds = ifelse(is.na(Odds), mean(Odds, na.rm = TRUE), Odds)),data$Odds),c("Horse","Odds_horse_mean","Odds"))
# Rows whose original Odds was missing.
missing_horse_mean = horse_mean[is.na(horse_mean$Odds),]
print("RMSE")
## [1] "RMSE"
print(rmse(tips_data$Odds,horse_mean$Odds_horse_mean))
## [1] 0.1551146
library(zoo)
# Linear interpolation of missing Odds by row position (zoo::na.approx).
# NOTE(review): na.approx drops leading/trailing NAs by default; if an NA
# sat at either end of the column the interpolated vector would be shorter
# than data$Odds and data.frame() would fail — confirm no edge NAs here.
data_Odds_interpolate = setNames(data.frame(na.approx(data$Odds),data$Odds),c('Odds_ip','Odds'))
# Rows whose original Odds was missing.
missing_interpolate_Odds = data_Odds_interpolate[is.na(data_Odds_interpolate$Odds),]
print("RMSE")
## [1] "RMSE"
print(rmse(tips_data$Odds,data_Odds_interpolate$Odds_ip))
## [1] 0.0709432
library(mice)
#get a better understanding of the pattern of missing data
# md.pattern tabulates missingness combinations: 38240 fully observed rows
# and 8 rows missing only Odds.
md.pattern(data)
## X UID ID Tipster Date Track Horse Bet.Type Result TipsterActive
## 38240 1 1 1 1 1 1 1 1 1 1
## 8 1 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0 0
## Predicted.Results Odds
## 38240 1 1 0
## 8 1 0 1
## 0 8 8
#visual representation
library(VIM)
# aggr() plots the per-variable proportion of missing values and the joint
# missingness pattern; Odds is the only variable with any missingness.
aggr_plot <- aggr(data, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(data), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Odds 0.0002091613
## X 0.0000000000
## UID 0.0000000000
## ID 0.0000000000
## Tipster 0.0000000000
## Date 0.0000000000
## Track 0.0000000000
## Horse 0.0000000000
## Bet.Type 0.0000000000
## Result 0.0000000000
## TipsterActive 0.0000000000
## Predicted.Results 0.0000000000
#imputing missing data
# NOTE(review): maxit=0 runs zero Gibbs-sampling iterations, so complete()
# returns only mice's initial fill-in rather than converged pmm imputations
# (per the write-up below, maxit > 0 failed with a 4.8 Gb allocation error).
imputedData <- mice(data,m=5,maxit=0,meth="pmm",seed=500)
# Materialise the first completed dataset (Odds NAs filled in).
complete_data = complete(imputedData)
summary(complete_data)
## X UID ID Tipster
## Min. : 1 Min. : 1 Min. : 1 Tipster X : 4383
## 1st Qu.: 9563 1st Qu.: 9563 1st Qu.: 318 Tipster E : 3700
## Median :19124 Median :19124 Median : 749 Tipster B1: 2497
## Mean :19124 Mean :19124 Mean :1013 Tipster A1: 2446
## 3rd Qu.:28686 3rd Qu.:28686 3rd Qu.:1419 Tipster D1: 2119
## Max. :38248 Max. :38248 Max. :4383 Tipster J : 1937
## (Other) :21166
## Date Track Horse
## 30-07-2016: 110 Kempton : 2197 Doctor Parkes : 26
## 31-10-2015: 106 Wolverhampton: 2113 Chookie Royale : 23
## 10-10-2015: 104 Lingfield : 2058 Oriental Relation: 21
## 26-12-2015: 104 Ascot : 1355 Sennockian Star : 21
## 09-01-2016: 101 SouthWell : 1326 Barnet Fair : 20
## 06-08-2016: 100 Newmarket : 1291 Silviniaco Conti : 19
## (Other) :37623 (Other) :27908 (Other) :38118
## Bet.Type Odds Result TipsterActive
## Each Way: 7830 Min. : 1.07 Lose:30565 Mode :logical
## Win :30417 1st Qu.: 5.00 Win : 7683 FALSE:13062
## win : 1 Median : 8.00 TRUE :25186
## Mean : 11.00
## 3rd Qu.: 13.00
## Max. :407.00
##
## Predicted.Results
## Lose:33074
## Win : 5174
##
##
##
##
##
Hence, it is only possible to impute this dataset with the mice package when the parameter maxit = 0; with maxit > 0 the imputation fails with the error "Error: cannot allocate vector of size 4.8 Gb".
# x-axis positions for the 8 rows that had missing Odds values.
xval = setNames(data.frame(c(1:8)),c("X"))
library(plotly)
# Overlay each imputation strategy against the original Odds for the 8
# missing rows, to compare visually which method tracks the truth best.
# (Fixed: the line colours for the median and per-horse traces used a
# misspelled `colot` argument, which plotly silently ignored, so those
# traces rendered in default colours instead of the intended RGB values.)
lineplot <- plot_ly(data, x =xval$X) %>%
add_lines(y = ~missing_mean$Odds_mean, name = "Odds_mean",line = list(color = 'rgb(119, 244, 66)')) %>%
add_lines(y = ~missing_median$Odds_median, name = "Odds_median",line=list(color='rgb(229, 168, 36)')) %>%
add_lines(y = ~missing_horse_mean$Odds_horse_mean, name = "Odds_particular_horse_mean",line=list(color='rgb(77, 249, 223)')) %>%
add_lines(y = ~missing_interpolate_Odds$Odds_ip, name = "Interpolated_Odds") %>%
add_lines(y = ~tips_data[row.names(tips_data) %in% row.names(missing_mean),]$Odds, name = "Original_Odds") %>%
layout(
title = "Analysis of suitable method for missing values",
xaxis = list(rangeslider=list(type="date"),title="X"),
yaxis = list(title = "Odds"))
lineplot
Linear interpolation of the data points is the best method for filling in the missing values, since the graph of the interpolated Odds almost coincides with the original Odds and its RMSE (0.0709) is close to zero (though median imputation attains a slightly lower RMSE of 0.0571 on these 8 points).
library(caret)
# Confusion matrix of the tipsters' predicted results against the actual
# race results; 'Lose' is treated as the positive class.
confusionMatrix(data$Predicted.Results,data$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Lose Win
## Lose 28670 4404
## Win 1895 3279
##
## Accuracy : 0.8353
## 95% CI : (0.8316, 0.839)
## No Information Rate : 0.7991
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4156
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9380
## Specificity : 0.4268
## Pos Pred Value : 0.8668
## Neg Pred Value : 0.6337
## Prevalence : 0.7991
## Detection Rate : 0.7496
## Detection Prevalence : 0.8647
## Balanced Accuracy : 0.6824
##
## 'Positive' Class : Lose
##
Accuracy
(TP + TN)/(TP + TN + FP + FN)
(28670 + 3279)/(28670 + 3279 + 1895 + 4404)
= 0.8353
Precision
TP/(TP + FP)
28670/(28670 + 4404)
= 0.8668
Recall
TP/(TP + FN)
28670/(28670 + 1895)
= 0.9380
Misclassification/Error Rate
(FP + FN)/(TP + TN + FP + FN)
(1895 + 4404)/(28670 + 3279 + 1895 + 4404)
ERR = 0.164
F1-score
(2*Recall*Precision)/(Recall + Precision)
(2 * 0.9380 * 0.8668)/(0.9380 + 0.8668)
= 1.626/1.8048 = 0.9
F score with beta=2
(1+ Beta^2) * (precision * recall) / ( (Beta^2 * precision) + recall )
(5 * 0.8668 * 0.938)/((4 * 0.8668)+0.938)
= 0.923
F score with beta=0.5
(1.25 * 0.8668 * 0.938)/((0.25 * 0.8668)+0.938)
= 0.88
The general formula involves a positive real beta so that F-score measures the effectiveness of retrieval with respect to a user who attaches beta times as much importance to recall as precision.
# Baseline classifier: predict the majority class ('Lose') for every row.
Predictions1 = c('Lose')
# The length-1 vector is recycled by cbind to one value per row.
updated_data=cbind(data,Predictions1)
confusionMatrix(updated_data$Predictions1,updated_data$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Lose Win
## Lose 30565 7683
## Win 0 0
##
## Accuracy : 0.7991
## 95% CI : (0.7951, 0.8031)
## No Information Rate : 0.7991
## P-Value [Acc > NIR] : 0.5031
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7991
## Neg Pred Value : NaN
## Prevalence : 0.7991
## Detection Rate : 0.7991
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Lose
##
Accuracy
(TP + TN)/(TP + TN + FP + FN)
(30565 + 0)/(30565 + 7683 + 0 + 0)
= 0.7991
Precision
TP/(TP + FP)
30565 / (30565 + 7683)
= 0.7991
Recall
TP/(TP + FN)
30565/(30565 + 0)
= 1
Misclassification/Error Rate
(FP + FN)/(TP + TN + FP + FN)
(7683 + 0)/(30565 + 7683 + 0 + 0)
= 0.20
F-score
(2*Recall*Precision)/(Recall + Precision)
= 0.889
This model has F1-score = 0.889, which is less than the F1-score of the previous model (0.90 as computed above). We can conclude that this model is not a better classifier.
Accuracy is the most intuitive performance measure and it is simply a ratio of correctly predicted observation to the total observations. One may think that, if we have high accuracy then our model is best. Accuracy works best if false positives and false negatives have similar cost. If we have an uneven class distribution, accuracy is not enough for the evaluation of a classification model.
Recall is the ratio of correctly predicted positive observations to all observations in the actual class. Precision is the ratio of correctly predicted positive observations to the total predicted positive observations. F1 score is the weighted average of precision and recall; it therefore takes both false positives and false negatives into account. Since the F1-score of 0.90 is high, we can conclude that the classifier used to predict the results is better than the classifier with the lower F1-score.
# Predict 2017 happiness scores from models fitted on 2015 + 2016 data.
data2015<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/Data2015.csv",header=TRUE)
data2016<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/Data2016.csv",header=TRUE)
data2017<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/Data2017.csv",header=TRUE)
# Keep only the countries present in all three years.
# NOTE(review): `c` shadows base::c from here on — harmless as written,
# but worth renaming if this script grows.
c<-intersect(data2017$Country,data2016$Country)
country<-intersect(c,data2015$Country)
# Train on 2015+2016 rows, test on 2017 rows, restricted to common countries.
train2015<-subset(data2015,Country %in% country)
train2016<-subset(data2016,Country %in% country)
train<-rbind(train2015,train2016)
test<-subset(data2017,Country %in% country)
# Root-mean-square error between model output `m` and observed values `o`.
# Note: unlike the earlier `rmse()`, there is no na.rm here, so an NA in
# either input propagates to an NA result.
RMSE <- function(m, o) {
  sqrt(mean((m - o)^2))
}
# Model 1: GDP per capita, Family, year and life expectancy as predictors.
model1<-lm(Happiness.Score ~ Economy..GDP.per.Capita. + Family + year + Health..Life.Expectancy.,data=train)
pred1<-predict.lm(model1,subset(test,TRUE,select=c(Economy..GDP.per.Capita., Family, year, Health..Life.Expectancy.)))
# Model 2: same as model 1 but without Family.
model2<-lm(Happiness.Score ~ Economy..GDP.per.Capita. + year + Health..Life.Expectancy.,data=train)
pred2<-predict.lm(model2,subset(test,TRUE,select=c(Economy..GDP.per.Capita., year, Health..Life.Expectancy.)))
# Test-set RMSE for each model (lower is better); dropping Family roughly
# halves the error here.
RMSE(test$Happiness.Score,pred1)
## [1] 1.182017
RMSE(test$Happiness.Score,pred2)
## [1] 0.602027
library(caret)
library(InformationValue)
set.seed(100)
df<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/School_Data.dms",header=TRUE)
# Balance the classes by up-sampling the minority class of Pass.
# NOTE(review): passing the full df (including Pass) as x leaves the
# response duplicated in upSample's output (extra "Class" column) —
# confirm this is intentional.
Train_data<-upSample(x=df,y=factor(df$Pass))
test<-read.csv("/Users/vikramg/Desktop/Data_Analytics/Assignment_4/Test - Sheet1.csv",header=TRUE)
# Align the 9th test-set column name with the training data's name.
colnames(test)[9]="popularity"
# Full logistic model with all candidate predictors.
model1<-glm(Pass ~ Day1+Day2+Day3+Day4+Day5+Senior+Class_Prefect+Athlete+popularity,data=Train_data,family=binomial)
summary(model1)
##
## Call:
## glm(formula = Pass ~ Day1 + Day2 + Day3 + Day4 + Day5 + Senior +
## Class_Prefect + Athlete + popularity, family = binomial,
## data = Train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7918 -1.0254 0.1083 1.0326 2.2529
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.10186 0.13747 -0.741 0.45871
## Day1 -0.71755 0.22138 -3.241 0.00119 **
## Day2 0.08185 0.20529 0.399 0.69012
## Day3 -0.40919 0.21795 -1.877 0.06046 .
## Day4 1.57659 0.21019 7.501 6.33e-14 ***
## Day5 -0.17310 0.18941 -0.914 0.36077
## Senior -0.24305 0.24360 -0.998 0.31840
## Class_Prefect 0.13014 0.35991 0.362 0.71767
## Athlete 1.51479 0.61831 2.450 0.01429 *
## popularity -3.99837 1.19969 -3.333 0.00086 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 970.41 on 699 degrees of freedom
## Residual deviance: 865.41 on 690 degrees of freedom
## AIC: 885.41
##
## Number of Fisher Scoring iterations: 4
Looking at the p-values, Day1, Day3, Day4, Athlete and popularity appear to be the most significant features.
# Reduced logistic model keeping only the predictors that looked
# significant in model1: Day1, Day3, Day4, Athlete and popularity.
# (Fixed a "Day1++Day3" typo in the formula; the stray "+" was parsed as a
# harmless unary plus, so the fitted coefficients are unchanged — only the
# recorded Call line below still shows the old form.)
model2<-glm(Pass ~ Day1+Day3+Day4+Athlete+popularity,data=Train_data,family="binomial")
summary(model2)
##
## Call:
## glm(formula = Pass ~ Day1 + +Day3 + Day4 + Athlete + popularity,
## family = "binomial", data = Train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7551 -1.0452 0.1548 1.0240 2.3637
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1451 0.1317 -1.102 0.270624
## Day1 -0.6839 0.2014 -3.395 0.000685 ***
## Day3 -0.4112 0.2064 -1.992 0.046354 *
## Day4 1.5415 0.2030 7.594 3.11e-14 ***
## Athlete 1.5809 0.6106 2.589 0.009620 **
## popularity -4.1662 1.1099 -3.754 0.000174 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 970.41 on 699 degrees of freedom
## Residual deviance: 867.34 on 694 degrees of freedom
## AIC: 879.34
##
## Number of Fisher Scoring iterations: 4
Based on the AIC value (879.34 vs 885.41), Model 2 comes out to be the better model.
# Evaluate model1 on the held-out test set: pick the probability cutoff
# that InformationValue::optimalCutoff deems optimal, then tabulate
# predictions against the true Pass labels.
predict1<-predict(model1,test,type="response")
optCutOff1<- optimalCutoff(test$Pass, predict1)[1]
confusionMatrix(test$Pass, predict1, threshold = optCutOff1)
## 0 1
## 0 6 3
## 1 1 0
# Same evaluation for the reduced model2 — both models produce an
# identical confusion matrix on this 10-row test set.
predict2<-predict(model2,test,type="response")
optCutOff2 <- optimalCutoff(test$Pass, predict2)[1]
confusionMatrix(test$Pass, predict2, threshold = optCutOff2)
## 0 1
## 0 6 3
## 1 1 0